from create_base_v3 import find_md_files, split_content
import json

def save_list_to_json(string_list, file_path: str) -> None:
    """Write ``string_list`` to ``file_path`` as pretty-printed UTF-8 JSON.

    Non-ASCII characters are emitted as-is rather than ``\\uXXXX``-escaped,
    and nested levels are indented by two spaces. Any existing file at
    ``file_path`` is overwritten.
    """
    encoder = json.JSONEncoder(ensure_ascii=False, indent=2)
    with open(file_path, mode='w', encoding='utf-8') as out_file:
        # Stream the encoded pieces straight to disk instead of building
        # the whole document in memory first.
        out_file.writelines(encoder.iterencode(string_list))

# Read helper: load a JSON file back into Python objects.
def read_list_from_json(file_path: str):
    """Parse the UTF-8 JSON file at ``file_path`` and return its content.

    The return value is whatever the JSON document decodes to (a list when
    the file was produced from a list).
    """
    with open(file_path, mode='r', encoding='utf-8') as in_file:
        raw_text = in_file.read()
    return json.loads(raw_text)

if __name__ == "__main__":
    # Script entry point: split every Markdown file under the docs directory
    # into chunks and dump two index-aligned JSON lists -- the chunks and the
    # path of the file each chunk came from.
    import os

    chunks_path = "/root/data/mindqa/qac_data/all.json"
    sources_path = "/root/data/mindqa/qac_data/all_sources.json"

    # Make sure the output directory exists *before* processing the corpus,
    # so a missing directory cannot discard the work at save time.
    for out_path in (chunks_path, sources_path):
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

    file_list = find_md_files("/root/data/mindqa/docs")
    all_chunks = []
    all_sources = []
    for file_path in file_list:
        # Read the whole md file as UTF-8 text.
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        content_splitted = split_content(content)
        all_chunks.extend(content_splitted)
        # Repeat the file path once per chunk so that all_sources[i] is the
        # file that all_chunks[i] was taken from.
        all_sources.extend([file_path] * len(content_splitted))
    print(len(file_list), "个文件")
    print(len(all_chunks),"个chunk")
    # Internal invariant: the two lists must stay index-aligned.
    assert len(all_chunks) == len(all_sources)
    save_list_to_json(all_chunks, chunks_path)
    save_list_to_json(all_sources, sources_path)